Python to R translation

Author

Cleyton Farias

Python Setup
# import libraries and define global settings
import numpy as np
import pandas as pd
import seaborn as sns
import scipy.stats as stats
import matplotlib.pyplot as plt

# The code below define global figure properties used for publication.
# import matplotlib_inline.backend_inline
# matplotlib_inline.backend_inline.set_matplotlib_formats('svg') # display figures in vector format
plt.rcParams.update(
  {'font.size':14,             # font size
  'savefig.dpi':300,          # output resolution
  'axes.titlelocation':'left',# title location
  'axes.spines.right':False,  # remove axis bounding box
  'axes.spines.top':False,    # remove axis bounding box
  }
)

# setting seed
np.random.seed(10)
R Setup
# Attaching packages
library(dplyr)
library(glue)
library(stringr)
library(ggplot2)
library(patchwork)

# setting seed
set.seed(10)

Figure 7.2: What to look for in visual inspection of data

Python version:

Code
_,axs = plt.subplots(2,2,figsize=(12,6))

# panel A: unexpected range
xA = np.concatenate((np.random.randn(20),np.random.randn(80)*30),axis=0)
axs[0,0].plot(xA,'ks',markersize=10,markerfacecolor=(.7,.7,.7),alpha=.8)
axs[0,0].set(xlabel='Data index',xticks=[],yticks=[],ylabel='Data value')
axs[0,0].set_title(r'$\bf{A}$)  Unexpected data range')

# panel B: distribution shape
xB = np.concatenate((5+np.random.randn(150),np.exp(1+np.random.randn(150))),axis=0)
axs[0,1].hist(xB, bins='fd',edgecolor='k',facecolor=(.7,.7,.7))
axs[0,1].set(xlabel='Data value',xticks=[],yticks=[],ylabel='Count')
axs[0,1].set_title(r'$\bf{B}$)  Nonstandard distribution')

# panel C: mixed datasets
xC = np.concatenate((4+np.random.randn(150),np.random.randn(150)-4),axis=0)
axs[1,0].hist(xC,bins=50,edgecolor='k',facecolor=(.7,.7,.7))
axs[1,0].set(xlabel='Data value',xticks=[],yticks=[],ylabel='Count')
axs[1,0].set_title(r'$\bf{C}$)  Mixed dataset')

# panel D: outliers
xD = np.random.randn(150)
xD[60] = 10
xD[84] = 14
axs[1,1].plot(xD,'ks',markersize=10,markerfacecolor=(.7,.7,.7),alpha=.8)
axs[1,1].set(xlabel='Data index',xticks=[],yticks=[],ylabel='Data value')
axs[1,1].set_title(r'$\bf{B}$)  Outliers')

# export
plt.tight_layout()
plt.savefig('dataQC_qualInspection.png')
plt.show()

R version:

Code
# panel A: unexpected range
xA=c(rnorm(20), rnorm(80)*80)
data_A = tibble(data_index=1:length(xA), data_value = xA)

pA = ggplot(data_A, aes(x=data_index, y = data_value)) +
  geom_point(
    shape=22,
    size=5,
    fill = "gray",
    alpha=.8
  ) + 
  theme(
      panel.grid.major = element_blank(), 
      panel.grid.minor = element_blank(),
      panel.background = element_blank(), 
      axis.line = element_line(colour = "black"),
      axis.ticks.x = element_blank(),
      axis.text.x = element_blank(),
      axis.title = element_text(size=15),
      title = element_text(size=15)
  ) + 
  labs(title = bquote(bold("A)")~"Unexpected data range"),
       y = "Data value", x = "Data index")

# panel B: distribution shape
xB = c(5+rnorm(150), exp(1+rnorm(150)))
data_B = tibble(data_index=1:length(xB), data_value=xB)

pB = ggplot(data_B, aes(x=data_value))+
    geom_histogram(color="black", fill="gray", bins=50) + 
    theme(
        panel.grid.major = element_blank(), 
        panel.grid.minor = element_blank(),
        panel.background = element_blank(), 
        axis.line = element_line(colour = "black"),
        axis.ticks.x = element_blank(),
        axis.text.x = element_blank(),
        axis.title = element_text(size=15),
        title = element_text(size=15)
    ) + 
    labs(title = bquote(bold("B)")~"Nonstandard distribution"),
         y = "Count", x = "Data value")

# panel C: mixed datasets
xC = c(4+rnorm(150), rnorm(150)-4)
data_C = tibble(data_index = 1:length(xC), data_value = xC)

pC = ggplot(data_C, aes(x=data_value))+
    geom_histogram(color="black", fill="gray") + 
    theme(
        panel.grid.major = element_blank(), 
        panel.grid.minor = element_blank(),
        panel.background = element_blank(), 
        axis.line = element_line(colour = "black"),
        axis.ticks.x = element_blank(),
        axis.text.x = element_blank(),
        axis.title = element_text(size=15),
        title = element_text(size=15)
    ) + 
    labs(title = bquote(bold("C)")~"Mixed dataset"),
         y = "Count", x = "Data value")

# panel D: outliers
xD = rnorm(150)
data_D = tibble(data_index=1:length(xD), data_value=xD)
data_D[60, "data_value"] = 10
data_D[84, "data_value"] = 14

pD = ggplot(data_D, aes(x = 1:nrow(data_D), y = data_value)) +
    geom_point(
        shape=22,
        size=5,
        fill = "gray",
        alpha=.8
    ) + 
    theme(
        panel.grid.major = element_blank(), 
        panel.grid.minor = element_blank(),
        panel.background = element_blank(), 
        axis.line = element_line(colour = "black"),
        axis.ticks.x = element_blank(),
        axis.text.x = element_blank(),
        axis.title = element_text(size=15),
        title = element_text(size=15)
    ) + 
    labs(title = bquote(bold("D)")~"Outliers"),
         y = "Data value", x = "Data index")


p7.2 = pA + pB + pC + pD
p7.2

Figure 7.4: Example of dataset with outliers

Python version:

Code
# Create normally distributed data
N = 100
data = np.random.randn(N)

# and add two random outliers in random positions
data[np.random.choice(np.arange(N),2)] = np.random.uniform(2,3,2)**2

# and plot
plt.figure(figsize=(8,4))
plt.plot(data,'ks',markersize=10,markerfacecolor=(.7,.7,.7))
plt.xlim([-2,N+1])
plt.xlabel('Data index')
plt.ylabel('Data value')

plt.tight_layout()
plt.show()

R version:

Code
# Create normally distributed data
N = 100
y = rnorm(N)
data = tibble(data_index = 1:length(y), data_value = y)

# and add two random outliers in random positiions
data[sample(N, 2), "data_value"] = runif(2, min=2, max=3)^2

# and plot:
p7.4 = ggplot(data, aes(x = data_index, y = data_value)) + 
    geom_point(
        shape=22, 
        size=5,
        fill = "gray",
        alpha=.8
    ) + 
    xlim(-2, N+1) + 
    theme(
        panel.grid.major = element_blank(), 
        panel.grid.minor = element_blank(),
        panel.background = element_blank(), 
        axis.line = element_line(colour = "black"),
        axis.title = element_text(size=15),
        axis.text = element_text(size=15),
        title = element_text(size=15)
    ) +
    labs(y = "Data value", x = "Data index")

p7.4

Figure 7.6: Z-score method for identifying outliers

Python version:

Code
# outlier threshold
zThreshold = 3.09

# create some raw data
N = 135
data = np.exp(np.random.randn(N)/2) + 5

# zscore the data
dataZ = (data-np.mean(data)) / np.std(data,ddof=1)

# identify data indices containing outliers
outliers = np.where(np.abs(dataZ)>zThreshold)[0]

# and plot
_,axs = plt.subplots(1,2,figsize=(10,4))
axs[0].plot(data,'ks',markersize=10,markerfacecolor=(.7,.7,.7))
axs[0].set(xlim=[-2,N+1],xlabel='Data index',ylabel='Data value')
axs[0].set_title(r'$\bf{A}$)  Original data')


axs[1].plot(dataZ,'ks',markersize=10,markerfacecolor=(.9,.9,.9))
axs[1].axhline(zThreshold,linestyle='--',color=(.9,.9,.9))
axs[1].plot(outliers,dataZ[outliers],'kx',markersize=10,markeredgewidth=2)
axs[1].set(xlim=[-3,N+2],xlabel='Data index',ylabel='Transformed data value')
axs[1].set_title(r'$\bf{B}$)  Z-transformed data')

plt.tight_layout()
plt.show()

R version:

Code
# outlier threshold
zThreshold = 3.09

# create some raw data
N = 135
data = exp(rnorm(N)/2) + 5

# zscore the data:
dataZ = (data - mean(data))/sd(data) # sd uses n-1 dof

# identify data indices containing outliers
outliers = abs(dataZ) > zThreshold

# and plot
data = tibble(data_index = 1:length(data), data_value = data)
p1 = ggplot(data, aes(x = data_index, y = data_value)) + 
    geom_point(
        shape=22, 
        size=5,
        fill = "#969696",
        alpha=.8
    ) + 
    theme(
        panel.grid.major = element_blank(), 
        panel.grid.minor = element_blank(),
        panel.background = element_blank(), 
        axis.line = element_line(colour = "black"),
        axis.title = element_text(size=15),
        axis.text = element_text(size=15),
        title = element_text(size=15)
    ) + 
    labs(title = bquote(bold("A)")~"Original data"),
         y = "Data value", x = "Data index")


dataZ = tibble(data_index = 1:length(dataZ), data_value = dataZ)
p2 = ggplot(dataZ, aes(x=data_index, y = data_value)) + 
    geom_point(
          shape=22, 
          size=5,
          fill = "gray",
          alpha=.8
    ) + 
    geom_point(
        data = dataZ[outliers,],
        aes(x = data_index, y=data_value),
        shape=7,
        stroke=1,
        size=3,
        fill = "gray",
        alpha=.8
    ) +
    geom_hline(
        yintercept = zThreshold, 
        linetype="dashed", 
        linewidth=.7,
        color="gray",
    ) + 
    theme(
        panel.grid.major = element_blank(), 
        panel.grid.minor = element_blank(),
        panel.background = element_blank(), 
        axis.line = element_line(colour = "black"),
        axis.title = element_text(size=15),
        axis.text = element_text(size=15),
        title = element_text(size=15)
    ) + 
    labs(title = bquote(bold("A)")~"Z-transformed data"),
         y = "Transformed data value", x = "Data index")

p7.6 = p1 + p2
p7.6

Figure 7.7: Impact of removing outliers on z-values

Python version:

Code
# create some raw data
N = 10 # sample size
data = np.exp(np.random.randn(N)/2) + 5
data[-1] = np.max(data)+2 # impose an outlier (at the end for convenience)
xvals = np.arange(N)

dataZ1 = (data-np.mean(data)) / np.std(data,ddof=1)
dataZ2 = (data[:-1]-np.mean(data[:-1])) / np.std(data[:-1],ddof=1)

_,axs = plt.subplots(1,2,figsize=(10,4))
axs[0].plot(xvals,data,'ks',markersize=10,markerfacecolor=(.7,.7,.7))
axs[0].set(xticks=[],xlabel='Data index',ylabel='Raw data value')
axs[0].set_title(r'$\bf{A}$)  Raw data')

axs[1].plot(xvals,dataZ1,'ks',markersize=10,markerfacecolor=(.7,.7,.7),label='Z with outlier')
axs[1].plot(xvals[:-1],dataZ2,'ko',markersize=10,markerfacecolor=(.5,.5,.5),label='Z without outlier')
axs[1].set(xticks=[],xlabel='Data index',ylabel='Transformed data value')
axs[1].legend()
axs[1].set_title(r'$\bf{B}$)  Z-transformed data')

# draw lines connection pre/post-removal values
for d,z,x in zip(dataZ1[:-1],dataZ2,xvals[:-1]):
  axs[1].plot([x,x],[d,z],':',color=(.7,.7,.7),zorder=-10)

plt.tight_layout()
plt.show()

R version:

Code
# create some raw data
N = 10 # sample size
data = exp(rnorm(N)/2) + 5
data[N] = max(data) + 2 # impose an outlier (at the end for convenience)

dataZ1 = (data - mean(data)) / sd(data) # sd uses n-1 dof
dataZ2 = (data[-N] - mean(data[-N])) / sd(data[-N]) # sd uses n-1 dof

# and plot:
data = tibble(data_index = 1:length(data), data_value = data)
pA = ggplot(data, aes(x = data_index, y = data_value)) +
    geom_point(
        shape=22, 
        size=5,
        fill = "gray"
    ) + 
    theme(
        panel.grid.major = element_blank(), 
        panel.grid.minor = element_blank(),
        panel.background = element_blank(), 
        axis.line = element_line(colour = "black"),
        axis.text.x=element_blank(),
        axis.ticks.x=element_blank(),
        axis.title = element_text(size=15),
        axis.text = element_text(size=15),
        title = element_text(size=15)
    ) + 
    labs(title = bquote(bold("A)")~"Raw data"),
         y = "Raw data value", x = "Data index")


dataZ1 = tibble(data_index = 1:length(dataZ1), data_value = dataZ1, id="Z with outlier")
dataZ2 = tibble(data_index = 1:length(dataZ2), data_value = dataZ2, id="Z without outlier")
dataZ = rows_append(dataZ1, dataZ2)

pB = ggplot(dataZ, 
       aes(x = data_index, y=data_value, 
           group = data_index,
           shape=id,
           fill=id)) + 
  geom_line(
        linetype="dashed", 
        linewidth=.3
    ) +
    geom_point(
        size=5
    ) + 
    
    scale_shape_manual(values=c("Z with outlier" = 22,
                                "Z without outlier" = 21)) + 
    scale_fill_manual(values=c("Z with outlier" = "#bdbdbd",
                               "Z without outlier" = "#525252")) + 
    theme(
        panel.grid.major = element_blank(), 
        panel.grid.minor = element_blank(),
        panel.background = element_blank(), 
        axis.line = element_line(colour = "black"),
        axis.text.x=element_blank(),
        axis.ticks.x=element_blank(),
        legend.position = c(.8,.9),
        legend.title = element_blank(),
        legend.box.background = element_rect(color="gray", linewidth = 1),
        legend.text = element_text(size = 12),
        legend.key = element_blank(),
        legend.key.width = unit(1, "cm"),
        axis.title = element_text(size=15),
        axis.text = element_text(size=15),
        title = element_text(size=15)
    ) + 
    labs(shape="", fill="",
         title = bquote(bold("B)")~"Z-transformed data"),
         y = "Transformed data value", x = "Data index")

p7.7 = pA + pB
p7.7

Figure 7.9: Data trimming

Python version:

Code
N = 74
data = np.random.randn(N)**3

# find largest and smallest values
k = 2
sortidx = np.argsort(data)
minvals = sortidx[:k]
maxvals = sortidx[-k:]

_,axs = plt.subplots(2,1,figsize=(8,6))
axs[0].plot(data,'ks',markersize=10,markerfacecolor=(.9,.9,.9))
axs[0].plot(minvals,data[minvals],'kx',markersize=10,markeredgewidth=2)
axs[0].plot(maxvals,data[maxvals],'kx',markersize=10,markeredgewidth=2)
axs[0].set_title(r'$\bf{A}$)  Data with k-extreme points trimmed')


# create a Gaussian probability curve for the panel B
x = np.linspace(-4,4,401)
gpdf = stats.norm.pdf(x)

# the find the indices of the 2.5% and 97.5%
lbndi = np.argmin(np.abs(x-stats.norm.ppf(.025))) # lbndi = Lower BouND Index
ubndi = np.argmin(np.abs(x-stats.norm.ppf(1-.025)))


# plot the probability function and the vertical lines
axs[1].plot(x,gpdf,'k',linewidth=2)
axs[1].axvline(x[lbndi],color=(.5,.5,.5),linewidth=.5,linestyle='--')
axs[1].axvline(x[ubndi],color=(.5,.5,.5),linewidth=.5,linestyle='--')
axs[1].set(xlim=x[[0,-1]],ylim=[0,.42])
axs[1].set_title(r'$\bf{B}$)  Histogram showing trimmed areas')

# now create patches for the rejected area
axs[1].fill_between(x[:lbndi+1],gpdf[:lbndi+1],color='k',alpha=.4)
axs[1].fill_between(x[ubndi:],gpdf[ubndi:],color='k',alpha=.4)


# and save
plt.tight_layout()
plt.show()

R Version:

Code
N = 74
y = rnorm(N)^3
data = tibble(data_index=1:length(y), data_value=y)

# find largest and smallest values
k = 2
# sorting ascending order:
data = arrange(data, data_value)
largest = head(data, k)
smallest = tail(data, k)

pA = ggplot(data, aes(x = data_index, y = data_value)) + 
      geom_point(
        shape=22, 
        size=5,
        fill = "gray"
    ) + 
    geom_point(
        data = largest,
        aes(x = data_index, y = data_value),
        shape=7,
        size=5,
        stroke=1,
        fill = "gray",
        alpha=.8
    ) + 
    geom_point(
        data = smallest,
        aes(x = data_index, y = data_value),
        shape=7,
        size=5,
        stroke=1,
        fill = "gray",
        alpha=.8
    ) + 
    theme(
        panel.grid.major = element_blank(), 
        panel.grid.minor = element_blank(),
        panel.background = element_blank(), 
        axis.line = element_line(colour = "black"),
        axis.title = element_text(size=15),
        axis.text = element_text(size=15),
        title = element_text(size=15)
    ) + 
    labs(title = bquote(bold("A)")~"Data with k-extreme points trimmed"),
         y = "", x = "")

# create a Gaussian probability curve for the panel B
x = seq(-4,4,length=401)
gpdf = dnorm(x, mean = 0, sd = 1)
data = tibble(data_index = x, data_value = gpdf)

# find the indices of the 2.5% and 97.5%
lbndi = which.min(abs(x - qnorm(0.025))) # lower bound index
ubndi = which.min(abs(x - qnorm(0.975))) # upper bound index

# plot the probability function and the vertical lines
pB = ggplot(data = data, aes(x = data_index, y = data_value)) + 
    geom_line() + 
    geom_vline(xintercept = x[lbndi], linetype="dashed") + 
    geom_ribbon(
        data = filter(data, data_index <= x[lbndi]), 
        aes(ymax = data_value, ymin=0),
        fill="#969696") + 
     geom_vline(xintercept = x[ubndi], linetype="dashed") + 
     geom_ribbon(
        data = filter(data, data_index >= x[ubndi]), 
        aes(ymax = data_value, ymin=0),
        fill="#969696") + 
    theme(
        panel.grid.major = element_blank(), 
        panel.grid.minor = element_blank(),
        panel.background = element_blank(), 
        axis.line = element_line(colour = "black"),
        axis.title = element_text(size=15),
        axis.text = element_text(size=15),
        title = element_text(size=15)
    ) + 
    labs(title = bquote(bold("B)")~"Histogram showing trimmed areas"),
         y = "", x = "")

p7.9 = pA + pB + plot_layout(ncol=1)
p7.9

Exercise 1

Python version:

Code
## iterative method
# Note about this code: Because of random numbers, you are not guaranteed to get a result
# that highlights the method. Try running the code several times.

N = 30
data = np.random.randn(N)
data[data<-1] = data[data<-1]+2
data[data>1.5] = data[data>1.5]**2; # try to force a few outliers


# pick a lenient threshold just for illustration
zscorethresh = 2
dataZ = (data-np.mean(data)) / np.std(data,ddof=1)

plt.figure(figsize=(10,4))

colorz = 'brkmc'
numiters = 0 # iteration counter
while True:

  # convert to z
  datamean = np.nanmean(dataZ)
  datastd  = np.nanstd(dataZ,ddof=1)
  dataZ = (dataZ-datamean) / datastd

  # find data values to remove
  toremove = dataZ>zscorethresh

  # break out of while loop if no points to remove
  if sum(toremove)==0:
    break
  else:
    # otherwise, mark the outliers in the plot
    plt.plot(np.where(toremove)[0]+numiters/5,dataZ[toremove],'%sx'%colorz[numiters],
             markersize=12,markeredgewidth=3)
    dataZ[toremove] = np.nan

  # replot
  plt.plot(np.arange(N)+numiters/5,dataZ,linestyle='None',marker=f'${numiters}$',markersize=12,
           color=colorz[numiters])

  # update counter
  numiters = numiters + 1


plt.ylabel('Z-score')
plt.xlabel('Data index')

plt.tight_layout()
plt.show()

R version:

Some notes: if I understand correctly, you want to simulate some data, add some outliers and calculate the z-scores and removing them to see the behavior of the dataset.

In each iteration, you want to find all outliers, plot the all the datapoints, and mark the outliers to remove them in the next round.

If that’s the case, I take the liberty to reorganize the code and do the following:

  • Commented the part of the original data does not get z-scored twice in the first iteration;
  • Reorganized the replot section in the for-loop so that the data points are plotted even when no outlier is found. This will prevent the case of no-plotting when no outlier is found in the first iteration.

Please, let me know if these changes make sense. In any case, I can follow strictly to the python code.

Code
## iterative method
# Note about this code: Because of random numbers, you are not guaranteed to get a result
# that highlights the method. Try running the code several times.

# Simulate the data:
N = 30
data = rnorm(N)
data[data < -1] = data[data < -1] + 2
data[data > 1.5] = data[ data > 1.5]^2

# pick a lenient threshold just for illustration
zscorethresh = 2

# ======================================================= # 
# if this part is included, then the first iteration will be 
# z-scored two times.
# dataZ = (data-mean(data)) / sd(data)

# color pallete:
colorz = c("blue", "black", "red", "magenta", "cyan")

# start the skeleton of the plot:
p = ggplot() + 
    theme(
        panel.grid.major = element_blank(), 
        panel.grid.minor = element_blank(),
        panel.background = element_blank(), 
        axis.line = element_line(colour = "black"),
        axis.title = element_text(size=15),
        axis.text = element_text(size=15),
        title = element_text(size=15)
    ) + 
    labs(y = "Z-score", x = "Data index")

# initiate counter:
numiters = 1

# initiate the dataZ with original data:
dataZ = data

while(TRUE) {
    # convert to z-score:
    datamean = mean(dataZ, na.rm = T)
    datastd = sd(dataZ, na.rm = T)
    dataZ = (dataZ - datamean)/datastd
    
    # find data values to remoove:
    toRemove = dataZ > zscorethresh
    
    # plot points in the current iteration:
    nmb = numiters
    color_ = colorz[nmb]
    data_temp = tibble(data_index = 1:N + numiters/5, data_value = dataZ)
    p <- p +
    geom_text(
        data = data_temp,
        aes(x = data_index, y = data_value),
        label = nmb,
        size = 5,
        color = color_
    )
    
    # break out of the while loop if no points to remove:
    if (sum(toRemove, na.rm = T) == 0) {
        break
    } else{
        # otherwise, mark the outliers in the plot for the current iteration:
        outlier_data = tibble(
            data_index = which(toRemove) + numiters/5,
            data_value = dataZ[toRemove]
        )
        color_ = colorz[numiters]
        p <- p + 
            geom_point(
                data = outlier_data,
                aes(x=data_index, y=data_value),
                shape = 4,
                color = color_,
                size=2,
                stroke=1
            )
        dataZ[toRemove] = NaN
    }
    
    # update counter:
    numiters <- numiters + 1
}

p

Exercise 2

Python version

Code
# create data
N = 10000
Y = np.exp(np.sin(np.random.randn(N)))

# make a copy of the data to manipulate
Yc = Y.copy()

# not specified in the instructions, but always a good idea to inspect the data!
plt.hist(Y,bins=40)
plt.show()

Code
# percent to remove (two-tailed)
k = 4

# convert that to a number of data points to remove from each tail
pnts2nan = int( (k/2)/100 * N ) # with stated parameters, this should be 200

# find the data sorting
sort_idx = np.argsort(Y)

# nan the two tails separately
Yc[sort_idx[:pnts2nan]]  = np.nan
Yc[sort_idx[-pnts2nan:]] = np.nan

# confirm the right numbers of points
print(f'Total dataset size: {len(Yc)}')
print(f'Valid dataset size: {np.sum(~np.isnan(Yc))}')
Code
# compute the mean and median (also used in the next exercise)
meanY = np.mean(Y)
medianY = np.median(Y)

# print the means
print(f'Mean of original: {meanY:.3f}')
print(f'Mean of trimmed:  {np.nanmean(Yc):.3f}')

# print the medians
print(' ')
print(f'Median of original: {medianY:.3f}')
print(f'Median of trimmed:  {np.nanmedian(Yc):.3f}')

R version:

Code
# create data
N = 10000
Y = exp(sin(rnorm(N)))

# make a copy of the data to manipulate:
data = tibble(data_index = 1:length(Y), data_value = Y)
ggplot(data, aes(x = data_value)) + 
    geom_histogram(bins = 40, fill="#2c7fb8", color="white") + 
     theme(
        panel.grid.major = element_blank(), 
        panel.grid.minor = element_blank(),
        panel.background = element_blank(), 
        axis.line = element_line(colour = "black"),
        axis.title = element_text(size=15),
        axis.text = element_text(size=15),
        title = element_text(size=15)
    ) + 
    labs(y = "", x = "")

Code
# percent to remove (two-tailed)
k = 4 

# finding threshold in the distribution:
q = k/2 # percent from each side
thresh = quantile(data$data_value, c(q/100, 1-q/100))

# data with two-tailed points removed:
data_twoTailed_rmvd = data %>% 
    filter(data_value > thresh[1] & data_value < thresh[2])

# confirm the right number of points:
print(glue("Total dataset size: {nrow(data)}"))
Total dataset size: 10000
Code
print(glue("Total dataset size: {nrow(data_twoTailed_rmvd)}"))
Total dataset size: 9600
Code
# compute the mean and median (also used in the next exercise)
mean_Y = mean(data$data_value)
median_Y = median(data$data_value)

mean_Ytrimmed = mean(data_twoTailed_rmvd$data_value)
median_Ytrimmed = median(data_twoTailed_rmvd$data_value)

# print the means
print(glue("Mean of original: {sprintf('%.3f', mean_Y)}"))
Mean of original: 1,223
Code
print(glue("Mean of trimmed: {sprintf('%.3f', mean_Ytrimmed)}"))
Mean of trimmed: 1,210
Code
# print the medians
print(glue("Median of original: {sprintf('%.3f', median_Y)}"))
Median of original: 0,987
Code
print(glue("Median of trimmed: {sprintf('%.3f', median_Ytrimmed)}"))
Median of trimmed: 0,987

Exercise 3

Python version:

Code
# the range of k values
ks = np.arange(1,50,3)

# initialize a results matrix for mean/median
results = np.zeros((len(ks),2))


# the experiment!
for idx,ki in enumerate(ks):

  # make a new copy of the original data
  Yc = Y.copy() # Note: Y was defined in Exercise 2

  # convert that to a number of data points to remove from each tail
  pnts2nan = int( (ki/2)/100 * N )

  # nan the two tails separately
  Yc[sort_idx[:pnts2nan]]  = np.nan
  Yc[sort_idx[-pnts2nan:]] = np.nan

  # collect mean and median
  results[idx,0] = 100*(np.nanmean(Yc)-meanY) / meanY
  results[idx,1] = 100*(np.nanmedian(Yc)-medianY) / medianY

  print(f'Total/valid dataset size: {len(Yc)} -> {np.sum(~np.isnan(Yc))}')
Code
# plot the results
plt.figure(figsize=(8,4))
plt.plot(ks,results[:,0],'s-',color=[.6,.6,.6],markerfacecolor=[.8,.8,.8],markersize=10,label='Mean')
plt.plot(ks,results[:,1],'o-',color='k',markerfacecolor=[.4,.4,.4],markersize=10,label='Median')
plt.legend()
plt.xlabel('k% to trim')
plt.ylabel(r'Descriptive value (%$\Delta$)')

plt.tight_layout()
plt.show()

R version

Code
# the range of k values
ks = seq(1, 50, 3)

# initialize a results matrix for mean/median
results = matrix(0, length(ks), 2)

# the experiment!
for (idx in seq_along(ks)) {
    #idx = 1
    # convert that to a number of data points to remove from each tail:
    q = ks[idx]/2
    # get the thresholds:
    thresh = quantile(data$data_value, c(q/100, 1-q/100))
    
    # get the remaining data points:
    data_temp = data %>% 
        filter(data_value > thresh[1] & data_value < thresh[2])
    # collect mean and median and calculate % difference from original ones:
    results[idx, 1] = 100*(mean(data_temp$data_value) - mean(data$data_value)) / mean(data$data_value)
    results[idx, 2] = 100*(median(data_temp$data_value) - median(data$data_value)) / median(data$data_value)
    
    print(glue("Total/valid dataset size: {nrow(data)} -> {nrow(data_temp)}"))
}
Total/valid dataset size: 10000 -> 9900
Total/valid dataset size: 10000 -> 9600
Total/valid dataset size: 10000 -> 9300
Total/valid dataset size: 10000 -> 9000
Total/valid dataset size: 10000 -> 8700
Total/valid dataset size: 10000 -> 8400
Total/valid dataset size: 10000 -> 8100
Total/valid dataset size: 10000 -> 7800
Total/valid dataset size: 10000 -> 7500
Total/valid dataset size: 10000 -> 7200
Total/valid dataset size: 10000 -> 6900
Total/valid dataset size: 10000 -> 6600
Total/valid dataset size: 10000 -> 6300
Total/valid dataset size: 10000 -> 6000
Total/valid dataset size: 10000 -> 5700
Total/valid dataset size: 10000 -> 5400
Total/valid dataset size: 10000 -> 5100
Code
ggplot(tibble(data_index = ks, mean = results[,1], median = results[,2])) + 
    geom_point(
        aes(x = data_index, y = mean, 
            color = "Mean",
            fill = "Mean",
            shape = "Mean"
            ),
        size=4
    ) + 
    geom_line(aes(x = data_index, y = mean, color = "Mean")) + 
    geom_point(
        aes(x = data_index, y = median,
            color = "Median", 
            fill = "Median", 
            shape = "Median"),
        size=4
    ) + 
    geom_line(aes(x = data_index, y = median, color="Median")) +
    scale_shape_manual(
        values = c("Mean" = 22,
                   "Median" = 21)
    ) + 
    scale_color_manual(
        values = c("Mean" = "#bdbdbd",
                   "Median" = "#525252")
    ) + 
    scale_fill_manual(
        values = c("Mean" = "#bdbdbd",
                   "Median" = "#525252")
    ) +
    theme(
        panel.grid.major = element_blank(), 
        panel.grid.minor = element_blank(),
        panel.background = element_blank(), 
        legend.position = c(.1,.1),
        legend.title = element_blank(),
        legend.box.background = element_rect(color="gray", linewidth = 1),
        legend.text = element_text(size = 12),
        legend.key = element_blank(),
        legend.key.width = unit(1, "cm"),
        axis.line = element_line(colour = "black"),
        axis.title = element_text(size=15),
        axis.text = element_text(size=15),
        title = element_text(size=15)
    ) + 
    labs(fill= "", color = "", shape = "", 
         y = expression(paste("Descriptive value (%",Delta,")")), x = "k% to trim")

Exercise 4

Python version:

Code
# create data
N = 1000
X = np.random.f(5,100,size=N)

# zscore data
Xz = (X-np.mean(X)) / np.std(X,ddof=1)
zThresh = 3

# clean data
Xclean = X[Xz<zThresh]

# report number of removed data points
print(f'Original sample size: {N}')
print(f'Cleaned sample size:  {len(Xclean)}')
print(f'Percent data removed: {100*(1-len(Xclean)/N):.2f}%')
Code
# histogram bins using FD rule
y_fd_all = np.histogram(X,bins='fd')
y_fd_clean = np.histogram(Xclean,bins='fd')

# histogram bins using set boundaries
# Note that I create the bin boundaries using k+1 numbers, then input that vector of boundaries into np.histogram
edges = np.linspace(np.min(X),np.max(X),41)
y_40_all = np.histogram(X,bins=edges)
y_40_clean = np.histogram(Xclean,bins=edges)


# plotting the histograms
_,axs = plt.subplots(2,1,figsize=(8,7))
axs[0].plot((y_fd_all[1][:-1]+y_fd_all[1][1:])/2,y_fd_all[0],'ks-',
         label='Pre-cleaned',markersize=11,markerfacecolor=(.6,.6,.6))
axs[0].plot((y_fd_clean[1][:-1]+y_fd_clean[1][1:])/2,y_fd_clean[0],'ko-',
         label='Cleaned',markersize=10,markerfacecolor=(.9,.9,.9))
axs[0].set_title(r'$\bf{A}$)  Histograms using F-D rule')

axs[1].plot((y_40_all[1][:-1]+y_40_all[1][1:])/2,y_40_all[0],'ks-',
         label='Pre-cleaned',markersize=11,markerfacecolor=(.6,.6,.6))
axs[1].plot((y_40_clean[1][:-1]+y_40_clean[1][1:])/2,y_40_clean[0],'ko-',
         label='Cleaned',markersize=10,markerfacecolor=(.9,.9,.9))
axs[1].set_title(r'$\bf{B}$)  Histograms using 40 bins')


# axis adjustments
for a in axs:
  a.legend()
  a.set(xlabel='F value',ylabel='Count',
        xlim=[np.min(X)-.02,np.max(X)+.02],xticks=range(6))

plt.tight_layout()
plt.savefig('dataQC_ex4.png')
plt.show()

R Version

Code
N = 1000
x = rf(df1 = 5, df2 = 100, n = N)

# zscore data
xZ = (x - mean(x)) / sd(x) # sd uses n-1 dof
zThresh = 3

# clean data
xClean = x[xZ < zThresh]

# report number of removed data points:
print(glue("Original sample size: {N}"))
Original sample size: 1000
Code
print(glue("Cleaned sample size: {length(xClean)}"))
Cleaned sample size: 989
Code
print(glue("Percent data removed: {sprintf('%.2f', 100*(1 - length(xClean)/N))}%"))
Percent data removed: 1.10%
Code
# histogram bins using FD rule
edges_fd_all = seq(min(x), max(x), length=nclass.FD(x))
y_fd_all = hist(x, breaks = edges_fd_all, plot=F)

edges_fd_clean = seq(min(xClean), max(xClean), length=nclass.FD(xClean))
y_fd_clean = hist(xClean, breaks = edges_fd_clean, plot=F)

# function to creating dataframe
convert_to_DF = function(y){
    df_ = tibble(
        data_index = y$mids,
        data_value = y$counts
    )
    return (df_)
}

y_fd_all = convert_to_DF(y_fd_all)
y_fd_clean = convert_to_DF(y_fd_clean)

# histogram bins using set boundaries
# Note that I create the bin boundaries using k+1 numbers, then input that 
# vector of boundaries into np.histogram
edges = seq(min(x), max(x), length=41)
y_40_all = hist(x, breaks = edges, plot=F)
y_40_clean = hist(xClean, breaks = edges, plot=F)

# convert to DF:
y_40_all = convert_to_DF(y_40_all)
y_40_clean = convert_to_DF(y_40_clean)

# plotting the histograms:
pA = ggplot() + 
    geom_line(
        data = y_fd_all,
        aes(x = data_index, y = data_value,
            color = "Pre-cleaned",
        ),
        size = .5
    ) +
    geom_point(
        data = y_fd_all,
        aes(x = data_index, y = data_value,
            color = "Pre-cleaned", 
            fill = "Pre-cleaned",
            shape = "Pre-cleaned"
        ),
        size = 5,
        stroke = .2
    ) +
    geom_line(
        data = y_fd_clean,
        aes(x = data_index, y = data_value,
            color = "Cleaned"
        ),
        size = 0.5
    ) + 
    geom_point(
        data = y_fd_clean,
        aes(x = data_index, y = data_value,
            color = "Cleaned",
            fill = "Cleaned",
            shape = "Cleaned"
        ),
        size = 4,
        stroke = .5
    ) + 
    scale_shape_manual(
        name = "",
        values = c(
            "Pre-cleaned" = 22,
            "Cleaned" = 21
        )
    )  +
    scale_color_manual(
        name = "",
        values = c(
            "Cleaned" = "#252525",
            "Pre-cleaned" = "#252525"
        )
    ) + 
    scale_fill_manual(
        name = "",
        values = c(
            "Cleaned" = "#bdbdbd",
            "Pre-cleaned" = "#525252"
        )
    )  +
    theme(
        panel.grid.major = element_blank(), 
        panel.grid.minor = element_blank(),
        panel.background = element_blank(), 
        legend.position = c(.8,.9),
        legend.title = element_blank(),
        legend.box.background = element_rect(color="gray", linewidth = 1),
        legend.text = element_text(size = 12),
        legend.key = element_blank(),
        legend.key.width = unit(1, "cm"),
        axis.line = element_line(colour = "black"),
        axis.title = element_text(size=15),
        axis.text = element_text(size=15),
        title = element_text(size=15)
    ) + 
    labs(fill= "", color = "", shape = "", 
         title = bquote(bold("A)")~"Histogram using F-D rule"),
         y = "Count", x = "F value")

pB = ggplot() + 
    geom_line(
        data = y_40_all,
        aes(x = data_index, y = data_value,
            color = "Pre-cleaned",
        ),
        size = .5
    ) +
    geom_point(
        data = y_40_all,
        aes(x = data_index, y = data_value,
            color = "Pre-cleaned", 
            fill = "Pre-cleaned",
            shape = "Pre-cleaned"
        ),
        size = 5,
        stroke = .2
    ) +
    geom_line(
        data = y_40_clean,
        aes(x = data_index, y = data_value,
            color = "Cleaned"
        ),
        size = 0.5
    ) + 
    geom_point(
        data = y_40_clean,
        aes(x = data_index, y = data_value,
            color = "Cleaned",
            fill = "Cleaned",
            shape = "Cleaned"
        ),
        size = 4,
        stroke = .5
    ) + 
    scale_shape_manual(
        name = "",
        values = c(
            "Pre-cleaned" = 22,
            "Cleaned" = 21
        )
    )  +
    scale_color_manual(
        name = "",
        values = c(
            "Cleaned" = "#252525",
            "Pre-cleaned" = "#252525"
        )
    ) + 
    scale_fill_manual(
        name = "",
        values = c(
            "Cleaned" = "#bdbdbd",
            "Pre-cleaned" = "#525252"
        )
    )  +
    theme(
        panel.grid.major = element_blank(), 
        panel.grid.minor = element_blank(),
        panel.background = element_blank(), 
        legend.position = c(.8,.9),
        legend.title = element_blank(),
        legend.box.background = element_rect(color="gray", linewidth = 1),
        legend.text = element_text(size = 12),
        legend.key = element_blank(),
        legend.key.width = unit(1, "cm"),
        axis.line = element_line(colour = "black"),
        axis.title = element_text(size=15),
        axis.text = element_text(size=15),
        title = element_text(size=15)
    ) + 
    labs(fill= "", color = "", shape = "", 
         title = bquote(bold("B)")~"Histogram using 40 bins"),
         y = "Count", x = "F value")


pExercise4 = pA + pB + plot_layout(ncol=1)
pExercise4

Exercise 5

Python version:

Code
# url reference: https://archive.ics.uci.edu/ml/datasets/Arrhythmia

# import data
df = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/arrhythmia/arrhythmia.data',
                 usecols = np.arange(9),
                 names   = ['age','sex','height','weight','qrs','p-r','q-t','t','p'])

# inspect
df.head()
Code
# boxplots of raw data
plt.figure(figsize=(10,5))
sns.boxplot(data=df).set(xlabel='Data feature',ylabel='Data value')
plt.show()

Code
# make a copy of the original data matrix
df_z = df.copy()

for col in df_z.columns:
  if not (col=='sex'):
    df_z[col] = (df[col] - df[col].mean()) / df[col].std(ddof=1)

# inspect again
df_z
Code
# box plots of z-scored data
plt.figure(figsize=(10,5))
sns.boxplot(data=df_z).set(xlabel='Data feature',ylabel='Data value')
plt.show()

Code
# Note: this cell combines the previous graphs to make one figure for the book
_,axs = plt.subplots(2,1,figsize=(10,7))
sns.boxplot(data=df,  ax=axs[0]).set(xticks=[],ylabel='Data value')
axs[0].set_title(r'$\bf{A}$)  Raw data')
sns.boxplot(data=df_z,ax=axs[1]).set(xlabel='Data feature',ylabel='Transformed data value')
axs[1].set_title(r'$\bf{B}$)  Z-transformed data')

plt.tight_layout()
plt.show()

R version

Code
# import data
library(readr)
data <- read_csv(
    'http://archive.ics.uci.edu/ml/machine-learning-databases/arrhythmia/arrhythmia.data',
    col_select = 1:9,
    col_names = c('age','sex','height','weight','qrs','p-r','q-t','t','p')
)

# inspect
head(data, 5)
# A tibble: 5 × 9
    age   sex height weight   qrs `p-r` `q-t`     t     p
  <dbl> <dbl>  <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1    75     0    190     80    91   193   371   174   121
2    56     1    165     64    81   174   401   149    39
3    54     0    172     95   138   163   386   185   102
4    55     0    175     94   100   202   380   179   143
5    75     0    190     80    88   181   360   177   103
Code
ggplot(stack(data), aes(x = ind, y = values, fill=ind)) +
    geom_boxplot() + 
    theme(
        panel.grid.major = element_blank(), 
        panel.grid.minor = element_blank(),
        panel.background = element_blank(), 
        legend.position = "none",
        legend.title = element_blank(),
        legend.box.background = element_rect(color="gray", linewidth = 1),
        legend.text = element_text(size = 12),
        legend.key = element_blank(),
        legend.key.width = unit(1, "cm"),
        axis.line = element_line(colour = "black"),
        axis.title = element_text(size=15),
        axis.text = element_text(size=15),
        title = element_text(size=15)
    ) + 
    labs(fill= "", color = "", shape = "", 
         title = "",
         y = "Data value", x = "Data features")

Code
# scale the data:
cols_ = setdiff(names(data), "sex") # all columns, but 'sex' feature
dataZ = data %>% 
    mutate(across(all_of(cols_), function(x) (x - mean(x))/sd(x)))
# inspecting again
head(dataZ)
# A tibble: 6 × 9
     age   sex  height weight     qrs `p-r`  `q-t`      t       p
   <dbl> <dbl>   <dbl>  <dbl>   <dbl> <dbl>  <dbl>  <dbl>   <dbl>
1  1.73      0  0.641   0.713  0.135  0.844  0.114  0.114  1.20  
2  0.579     1 -0.0320 -0.251 -0.516  0.420  1.01  -0.588 -1.97  
3  0.457     0  0.156   1.62   3.19   0.175  0.563  0.422  0.464 
4  0.518     0  0.237   1.56   0.721  1.04   0.383  0.254  2.05  
5  1.73      0  0.641   0.713 -0.0599 0.576 -0.216  0.198  0.503 
6 -2.03      0  0.0757 -1.03   0.721  0.264 -1.38   0.114  0.0385
Code
# box plots of z-scored
ggplot(stack(dataZ), aes(x = ind, y = values, fill=ind)) +
    geom_boxplot() + 
    theme(
        panel.grid.major = element_blank(), 
        panel.grid.minor = element_blank(),
        panel.background = element_blank(), 
        legend.position = "none",
        legend.title = element_blank(),
        legend.box.background = element_rect(color="gray", linewidth = 1),
        legend.text = element_text(size = 12),
        legend.key = element_blank(),
        legend.key.width = unit(1, "cm") ,
        axis.line = element_line(colour = "black"),
        axis.title = element_text(size=15),
        axis.text = element_text(size=15),
        title = element_text(size=15)
    ) + 
    labs(fill= "", color = "", shape = "", 
         title = "",
         y = "Data value", x = "Data features")

Code
# Note: this cell combines the previous graphs to make one figure for the book
p_boxPlot_all = ggplot(stack(data), aes(x = ind, y = values, fill=ind)) +
    geom_boxplot() + 
    theme(
        panel.grid.major = element_blank(), 
        panel.grid.minor = element_blank(),
        panel.background = element_blank(), 
        legend.position = "none",
        legend.title = element_blank(),
        legend.box.background = element_rect(color="gray", linewidth = 1),
        legend.text = element_text(size = 12),
        legend.key = element_blank(),
        legend.key.width = unit(1, "cm"),
        axis.line = element_line(colour = "black"),
        axis.title = element_text(size=15),
        axis.text = element_text(size=15),
        title = element_text(size=15)
    ) + 
    labs(fill= "", color = "", shape = "", 
         title = bquote(bold("A)")~"Raw data"),
         y = "Data value", x = "")
p_boxPlot_all

Code
# box plots of z-scored
p_boxPlot_z = ggplot(stack(dataZ), aes(x = ind, y = values, fill=ind)) +
    geom_boxplot() + 
    theme(
        panel.grid.major = element_blank(), 
        panel.grid.minor = element_blank(),
        panel.background = element_blank(), 
        axis.line = element_line(colour = "black"),
        legend.position = "none",
        legend.title = element_blank(),
        legend.box.background = element_rect(color="gray", linewidth = 1),
        legend.text = element_text(size = 12),
        legend.key = element_blank(),
        legend.key.width = unit(1, "cm"),
        axis.title = element_text(size=15),
        axis.text = element_text(size=15),
        title = element_text(size=15)
    ) + 
    labs(fill= "", color = "", shape = "", 
         title = bquote(bold("B)")~"Z-transformed data"),
         y = "Data value", x = "Data features")

p_boxPlot = p_boxPlot_all + p_boxPlot_z + plot_layout(ncol = 1)
p_boxPlot

Exercise 6

Python version:

Code
# remove based on z-score threshold
zThresh = 3.09 # p<.001

df_clean = df.copy()
df_clean[df_z>zThresh]  = np.nan # positive tail
df_clean[df_z<-zThresh] = np.nan # negative tail
Code
# plot
_,axs = plt.subplots(2,1,figsize=(10,7))
sns.boxplot(data=df,ax=axs[0]).set(xticks=[],ylabel='Data value')
axs[0].set_title(r'$\bf{A}$)  Raw data')
sns.boxplot(data=df_clean,ax=axs[1]).set(xlabel='Data feature',ylabel='Data value')
axs[1].set_title(r'$\bf{B}$)  Cleaned data')

plt.tight_layout()
plt.savefig('dataQC_ex6a.png')
plt.show()

Code
# print the means
raw_means = df.mean().values
cleaned_means = df_clean.mean().values

for name,pre,post in zip(df.columns,raw_means,cleaned_means):
  print(f'{name:>6}: {pre:6.2f}  ->  {post:6.2f}')
Code
# compute percent change
pctchange = 100*(cleaned_means-raw_means) / raw_means

# and plot
plt.figure(figsize=(9,4))
plt.plot(pctchange,'ks',markersize=14,markerfacecolor=(.7,.7,.7))
plt.axhline(0,color='k',linewidth=2,zorder=-1)
plt.xticks(range(9),labels=df.columns)
plt.ylabel('Percent')
plt.title('Change in feature means after z-score data rejection',loc='center')
plt.grid()

plt.tight_layout()
plt.savefig('dataQC_ex6b.png')
plt.show()

R version :

Code
# remove based on z-score threshold
zThresh = 3.09 # p<.001

# creating copy:
data_clean = data
# loop over all columns
for (col_ in colnames(dataZ)) {
    data_clean[[col_]][dataZ[[col_]] > zThresh] = NA
    data_clean[[col_]][dataZ[[col_]] < -zThresh] = NA
}
Code
pA = ggplot(stack(data), aes(x = ind, y = values, fill=ind)) +
    geom_boxplot() + 
    theme(
        panel.grid.major = element_blank(), 
        panel.grid.minor = element_blank(),
        panel.background = element_blank(), 
        legend.position = "none",
        legend.title = element_blank(),
        legend.box.background = element_rect(color="gray", linewidth = 1),
        legend.text = element_text(size = 12),
        legend.key = element_blank(),
        legend.key.width = unit(1, "cm"),
        axis.line = element_line(colour = "black"),
        axis.title = element_text(size=15),
        axis.text = element_text(size=15),
        title = element_text(size=15)
    ) + 
    labs(fill= "", color = "", shape = "", 
         title = "",
         y = "Data value", x = "")

pB = ggplot(stack(data_clean), aes(x = ind, y = values, fill=ind)) +
    geom_boxplot() + 
    theme(
        panel.grid.major = element_blank(), 
        panel.grid.minor = element_blank(),
        panel.background = element_blank(), 
        
        legend.position = "none",
        legend.title = element_blank(),
        legend.box.background = element_rect(color="gray", linewidth = 1),
        legend.text = element_text(size = 12),
        legend.key = element_blank(),
        legend.key.width = unit(1, "cm"),
        axis.line = element_line(colour = "black"),
        axis.title = element_text(size=15),
        axis.text = element_text(size=15),
        title = element_text(size=15)
    ) + 
    labs(fill= "", color = "", shape = "", 
         title = "",
         y = "Data value", x = "Data features")

pA + pB + plot_layout(ncol=1)

Code
# print the means
raw_means = data %>% 
    summarize(across(everything(), mean))

clean_means = data_clean %>% 
    summarize(across(everything(), ~mean(.x, na.rm=T)))

for (col_ in colnames(data)) {
  print(glue('{str_pad(col_, 6, "left", " ")}: {sprintf("%6.2f", raw_means[[col_]])}  ->  {sprintf("%6.2f", clean_means[[col_]])}'))
}
   age:  46,47  ->   46,47
   sex:   0,55  ->    0,55
height: 166,19  ->  163,84
weight:  68,17  ->   68,33
   qrs:  88,92  ->   87,25
   p-r: 155,15  ->  159,76
   q-t: 367,21  ->  368,30
     t: 169,95  ->  168,02
     p:  90,00  ->   90,96
Code
# compute percent change
pctchange = 100*(clean_means - raw_means) / raw_means

ggplot(data=stack(pctchange), aes(x = ind, y = values)) + 
    geom_hline(
        yintercept = 0,
        color="black"
    ) + 
    geom_point(
        shape = 22,
        fill = "#969696",
        size=6
    ) + 

    theme_bw() + 
    theme(
        legend.position = "none",
        legend.title = element_blank(),
        legend.box.background = element_rect(color="gray", linewidth = 1),
        legend.text = element_text(size = 12),
        legend.key = element_blank(),
        legend.key.width = unit(1, "cm"),
        axis.line = element_line(colour = "black"),
        axis.title = element_text(size=15),
        axis.text = element_text(size=15),
        title = element_text(size=15),
        plot.title = element_text(hjust = .5)
        
    ) + 
    labs(title = "Change in feature means after z-score data rejection",
         y = "Percent", x = "")